Solved import nltk import random # read in the |
您所在的位置:网站首页 › nltk ngram › Solved import nltk import random # read in the |
import random

import nltk
# Load the entire corpus file into memory as one string.
with open('ara_wikipedia_2021_300K-sentences.txt', encoding='utf8') as corpus_file:
    corpus = corpus_file.read()

# Split the raw text into word tokens.
tokens = nltk.word_tokenize(corpus)

# Drop every token found in NLTK's Arabic stop-word list; a set keeps the
# membership test cheap for each token.
stop_words = set(nltk.corpus.stopwords.words('arabic'))
tokens = [word for word in tokens if word not in stop_words]

# Build one list of n-gram tuples per order n, keyed by n.
n_values = [2, 3, 4, 5, 6]
models = {size: list(nltk.ngrams(tokens, size)) for size in n_values}
def generate_text(model, start_word, num_words):
    """Generate at most ``num_words`` words by walking an n-gram model.

    Args:
        model: Sequence of equal-length n-gram tuples, e.g.
            ``list(nltk.ngrams(tokens, n))``.
        start_word: Word the generated text must begin with.
        num_words: Maximum number of words in the result.

    Returns:
        The generated words joined by single spaces. Returns ``''`` when no
        n-gram in ``model`` starts with ``start_word`` or when ``num_words``
        is not positive. The result is shorter than ``num_words`` if the
        walk reaches a context that has no continuation in ``model``.
    """
    if num_words <= 0:
        return ''

    # Seed the walk with a random n-gram that begins with the start word.
    seeds = [gram for gram in model if gram[0] == start_word]
    if not seeds:
        return ''
    current_gram = random.choice(seeds)

    # The seed alone can be longer than the requested length (num_words < n),
    # so keep no more than num_words of it.
    output = list(current_gram[:num_words])

    while len(output) < num_words:
        # The next word is drawn from every n-gram whose first n-1 words
        # equal the last n-1 words generated so far.
        context = current_gram[1:]
        candidates = [gram[-1] for gram in model if gram[:-1] == context]
        if not candidates:
            break
        next_word = random.choice(candidates)
        output.append(next_word)
        # Slide the window forward by one word.
        current_gram = tuple(context) + (next_word,)

    return ' '.join(output)
def prompt_user():
    """Ask for a sentence length and a start word, then generate text.

    Reads both values from standard input, picks one of the n-gram models in
    the module-level ``models`` dict at random, and delegates to
    ``generate_text``.

    Returns:
        The string returned by ``generate_text``.

    Raises:
        ValueError: If the entered number of words is not a valid integer.
    """
    num_words = int(input("Enter the number of words in the desired sentence: "))
    start_word = input("Enter one word to start the sentence: ").strip()

    # ``models`` is keyed by n-gram order, not by sentence length, so the
    # order is chosen at random and the whole n-gram list for that order is
    # passed on (a single n-gram is not a model).
    order = random.choice(list(models))
    return generate_text(models[order], start_word, num_words)
# test the text generation function with 8 samples
# Each entry is (start word, requested number of words).
samples = [
    ('الإسلام', 10),
    ('الكتاب', 7),
    ('الثقافة', 8),
    ('العلوم', 9),
    ('الفلسفة', 10),
    ('السياسة', 11),
    ('التاريخ', 12),
    ('الفنون', 13),
]

for start_word, num_words in samples:
    print(f"{num_words}-word sentence starting with '{start_word}':")
    # Generate directly from the sample's own values; going through
    # prompt_user() would ignore them and wait for keyboard input instead.
    model = models[random.choice(list(models))]
    print(generate_text(model, start_word, num_words))
    print()
# Count every trigram in the token stream and keep the 10 most common.
trigrams = nltk.ngrams(tokens, 3)
freq_dist = nltk.FreqDist(trigrams)
top_trigrams = freq_dist.most_common(10)

# Write them one per line as "w1 w2 w3: count".
with open('top_trigrams.txt', 'w', encoding='utf8') as file:
    file.writelines(
        f"{trigram[0]} {trigram[1]} {trigram[2]}: {count}\n"
        for trigram, count in top_trigrams
    )
------------------------------ Output: 10-word sentence starting with 'الإسلام': الإسلام هو الدين الذي يدعو إلى السلام والتعاون والتسامح والعدل والمساواة والإنصاف والإيثار والإخلاص والإخوانية.
7-word sentence starting with 'الكتاب': الكتاب هو الأداة التي تمكننا من التعلم والتقدم.
8-word sentence starting with 'الثقافة': الثقافة هي العنصر الأساسي في بناء المجتمع وتحقيق التقدم.
9-word sentence starting with 'العلوم': العلوم هي الأداة التي تساعدنا في فهم العالم وتحسين حياتنا وتحقيق التقدم.
10-word sentence starting with 'الفلسفة': الفلسفة هي البحث عن المعنى والغاية في الحياة والكون والإجابة على الأسئلة الأساسية حول الوجود والواقع والمعرفة. ------------------- How do I get this output? It is not shown to me when I run this code. |
CopyRight 2018-2019 办公设备维修网 版权所有 豫ICP备15022753号-3 |